# delimit ;
set more off ;
capture log close ;
clear all ;
version 16 ;

/* MASTER FILE FOR EPA WQ GRANT FINAL SP SURVEY DATA PROCESSING AND ANALYSIS */
/* AUTHOR: ROGER H. VON HAEFEN  */
/* LAST EDITED: OCTOBER 1, 2021 */


/* CREATING SAMPLING WEIGHTS */

log using $weights_log, replace text ;
log off ;

local wpop = 1111761*(1-.236) ;   /* 2019 ADULT POPULATION - https://www.census.gov/quickfacts/fact/table/wakecountynorthcarolina/PST045219        */
local gpop = 537174*(1-.221) ;    /* 2019 ADULT POPULATION - https://www.census.gov/quickfacts/fact/table/guilfordcountynorthcarolina/PST045219    */
local mpop = 1110356*(1-.233) ;   /* 2019 ADULT POPULATION - https://www.census.gov/quickfacts/fact/table/mecklenburgcountynorthcarolina/PST045219 */

log on ;
  di "Guilford:   " `gpop' ;
  di "Mecklenburg:" `mpop' ;
  di "Wake:       " `wpop' ;
  di "Total:      " (`wpop'+`gpop'+`mpop') ;
log off ;


/* ESTIMATING RESPONSE/COMPLETION RATES BY COUNTY */

use $invited_sample, replace ;

log on ;
   tab county_name ;
log off ;   
sort code ;
tempfile x ;
save `x', replace ;
clear all ;

import excel using $returned_mail, first ;
tab reason ;


/* IDENTIFY CODES WHERE AT LEAST ONE MAILING WAS NOT DELIVERED */

drop if inlist(reason,"declined to participate","no internet","technologically challenged") ;  /* MAILING WAS DELIVERED FOR THESE PEOPLE */
drop if inlist(code,"3822mb","9677kg","9388ew","8976ug","7347mv") ;                            /* THESE RESPONDENTS ACTUALLY COMPLETED SURVEY */
tab reason ;
sort code reason ;
by code : gen test = _N ;
tab test ;
tab reason if test == 1 ;
by code : drop if _n > 1 ;
keep code ;
sort code ;
merge 1:1 code using `x' ;
tab _merge ;
assert _merge ~= 1 ;
keep if _merge == 2 ;
drop _merge ;
sort code ;
save `x', replace ;

/* CHECKING FOR NONRESPONSE BIAS BY DISTANCE BANDS */

merge 1:1 code using $imputed_data ;
tab _merge ;
assert _merge ~= 2 ;
gen complete = (_merge == 3) ;
gen sample = 1 ;
sum complete sample ;
replace county = "G" if county_name == "Guilford" & county == "" ;
replace county = "M" if county_name == "Mecklenburg" & county == "" ;
replace county = "W" if county_name == "Wake" & county == "" ;

/* NOTE: CODE 6338EM RESIDES IN MECK BUT TOOK WAKE SURVEY (??) */

replace near_dist = near_dist/5280 ; /* CONVERTING FROM FEET TO MILES */
sum near_dist, detail ;
gen distance = (near_dist<0.5) + 2*(near_dist<1.5)*(near_dist>=0.5) + 3*(near_dist<3.0)*(near_dist>=1.5) + 4*(near_dist<6.0)*(near_dist>=3.0) + 5*(near_dist<100)*(near_dist>=6.0) ;
sum distance ;
sort distance ;
collapse (sum) complete sample, by(distance) ;

gen ratio = complete/sample ;
sort distance ;
log on ;
   di "COMPLETION RATES BY DISTANCE BANDS" ;
   d distance ;
   list distance ratio complete sample ;
log off ;   
local distance1 = ratio[1] ;
local distance2 = ratio[2] ;
local distance3 = ratio[3] ;
local distance4 = ratio[4] ;
local distance5 = ratio[5] ;


use `x', replace ;

merge 1:1 code using $imputed_data ;
tab _merge ;

gen complete = (_merge == 3) ;
gen sample = 1 ;
sum complete sample ;
replace county = "G" if county_name == "Guilford" & county == "" ;
replace county = "M" if county_name == "Mecklenburg" & county == "" ;
replace county = "W" if county_name == "Wake" & county == "" ;

tab county ;
sort county ;
collapse (sum) complete sample, by(county) ;

gen ratio = complete/sample ;
sort county ;
log on ;
   di "COMPLETION RATES BY COUNTY" ;
   list county ratio complete sample ;
log off ;   

local gnonr = ratio[1] ;
local mnonr = ratio[2] ;
local wnonr = ratio[3] ;

collapse (sum) complete sample ;
gen ratio = complete/sample ;
log on ;
   di "OVERALL COMPLETION RATES" ;
   list ratio complete sample ;
log off ;   


/* NOW GENERATING WEIGHTS - FOLOWING PROCEDURE OUTLINED IN LEGGETT ET AL'S MRE 2018 PAPER */


use $imputed_data, replace ;
sum if county == "G" ;
local gsamp = r(N) ;

sum if county == "M" ;
local msamp = r(N) ;

sum if county == "W" ;
local wsamp = r(N) ;

gen double wgt = . ; label var wgt "Sampling Weight" ;

/* BASE WEIGHTS */ 

replace wgt = `gpop'/`gsamp' if county == "G" ;
replace wgt = `mpop'/`msamp' if county == "M" ;
replace wgt = `wpop'/`wsamp' if county == "W" ;
total wgt if county == "G" ;
matrix define xx = e(b) ;
local wmeang = xx[1,1] ;
total wgt if county == "M" ;
matrix define xx = e(b) ;
local wmeanm = xx[1,1] ;
total wgt if county == "W" ;
matrix define xx = e(b) ;
local wmeanw = xx[1,1] ;

/* HOUSEHOLD SIZE ADJUSTMENT */

d adults ;
sum adults ;
assert adults ~= . ;
replace wgt = adults*wgt ;

total wgt if county == "G" ;
matrix define xx = e(b) ;
local ameang = xx[1,1] ;
replace wgt = (wgt/`ameang')*`wmeang' if county == "`cty'" ;
total wgt if county == "G" ;
di `wmeang' ;

total wgt if county == "M" ;
matrix define xx = e(b) ;
local ameanm = xx[1,1] ;
replace wgt = (wgt/`ameanm')*`wmeanm' if county == "M" ;
total wgt if county == "M" ;
di `wmeanm' ;

total wgt if county == "W" ;
matrix define xx = e(b) ;
local ameanw = xx[1,1] ;
replace wgt = (wgt/`ameanw')*`wmeanw' if county == "W" ;
total wgt if county == "W" ;
di `wmeanw' ;


/* NON-RESPONSE ADJUSTMENT USING DISTANCE BANDS */

di `distance1' ;
di `distance2' ;
di `distance3' ;
di `distance4' ;
di `distance5' ;
sum wgt ;
replace near_dist = near_dist/5280 ;
sum near_dist ;

replace wgt = wgt/(`distance1'*(near_dist<0.5) +                  `distance2'*(near_dist<1.5)*(near_dist>=0.5) + 
                   `distance3'*(near_dist<3.0)*(near_dist>=1.5) + `distance4'*(near_dist<6.0)*(near_dist>=3.0) + 
                   `distance5'*(near_dist<100)*(near_dist>=6.0)) ;

total wgt if county == "G" ;
matrix define xx = e(b) ;
local ameang = xx[1,1] ;
replace wgt = (wgt/`ameang')*`wmeang' if county == "G" ;
total wgt if county == "G" ;
di `wmeang' ;

total wgt if county == "M" ;
matrix define xx = e(b) ;
local ameanm = xx[1,1] ;
replace wgt = (wgt/`ameanm')*`wmeanm' if county == "M" ;
total wgt if county == "M" ;
di `wmeanm' ;

total wgt if county == "W" ;
matrix define xx = e(b) ;
local ameanw = xx[1,1] ;
replace wgt = (wgt/`ameanw')*`wmeanw' if county == "W" ;
total wgt if county == "W" ;
di `wmeanw' ;


/* RAKING USING COUNTY-LEVEL CENSUS DATA */
/* BASED ON FIVE VARIABLES:		 */
/* VAR1 = AGE65				 */
/* VAR2 = COLLEGE			 */
/* VAR3 = WHITE				 */
/* VAR4 = HISP_LATINO			 */
/* VAR5 = INC_MED			 */



log on ;
sum income age college white hisp_latino [w=wgt] ;
sum income age college white hisp_latino [w=wgt] if county == "G" ;
sum income age college white hisp_latino [w=wgt] if county == "M" ;
sum income age college white hisp_latino [w=wgt] if county == "W" ;
sum income [w=wgt], detail ;
sum income [w=wgt] if county == "G", detail ;
sum income [w=wgt] if county == "M", detail ;
sum income [w=wgt] if county == "W", detail ;
log off ;


gen inc_med = . ;
replace inc_med = (income > 53261) if county == "G" ;
replace inc_med = (income > 66641) if county == "M" ;
replace inc_med = (income > 80591) if county == "W" ;

replace inc_med = . if income ~= round(income) ;
sum inc_med if county == "G";
replace inc_med = r(mean) if inc_med == . & county == "G" ;
sum inc_med if county == "M";
replace inc_med = r(mean) if inc_med == . & county == "M" ;
sum inc_med if county == "W";
replace inc_med = r(mean) if inc_med == . & county == "W" ;


gen age65 = (age > 65) ;
replace age65 = . if age ~= round(age) ;
sum age65 if county == "G";
replace age65 = r(mean) if age65 == . & county == "G" ;
sum age65 if county == "M";
replace age65 = r(mean) if age65 == . & county == "M" ;
sum age65 if county == "W";
replace age65 = r(mean) if age65 == . & county == "W" ;

/* NOW I TURN ALL IMPUTED VALUES INTO 0/1s */

local var1 = "age65" ;
local var2 = "college" ;
local var3 = "white" ;
local var4 = "hisp_latino" ;
local var5 = "inc_med" ;

sum `var1' `var2' `var3' `var4' `var5' ;

local `var1'_G = .155/(1-.221) ;
local `var2'_G = .355 ;    /* CENSUS STAT IS FOR PERSONS 25+; I SUBTRACT 0.5 PERCENT B/C OUR DATA IS FOR 18+ */
local `var3'_G = .56 ;
local `var4'_G = .084 ;
local `var5'_G = .5 ;

local `var1'_M = .115/(1-.233) ;
local `var2'_M = .449 ;    /* CENSUS STAT IS FOR PERSONS 25+; I SUBTRACT 0.5 PERCENT B/C OUR DATA IS FOR 18+ */
local `var3'_M = .573 ;
local `var4'_M = .138 ;
local `var5'_M = .5 ;

local `var1'_W = .12/(1-.236) ;
local `var2'_W = .523 ;    /* CENSUS STAT IS FOR PERSONS 25+; I SUBTRACT 0.5 PERCENT B/C OUR DATA IS FOR 18+ */
local `var3'_W = .679 ;
local `var4'_W = .104 ;
local `var5'_W = .5 ;


preserve ;
   keep code county ;
   gen double wgt_final = 0 ;
   sort code ;
   tempfile zzz ;
   save `zzz', replace ;
restore ;

set seed 94764331 ;

local iter = 50 ;  /* NUMBER OF MONTE CARLO ITERATIONS FOR RANDOMLY IMPUTING MISSING DEMOGRAPHICS USING COUNTY-LEVEL AVERAGES */
forv a = 1/`iter' { ;

sort code ;
preserve ;

/* REPLACING IMPUTED VALUES WITH HOTDECKED VALUES - NECESSARY FOR RAKING APPROACH */

replace age65       = (rnormal() <= age65)       if !inlist(age65      ,0,1) ;
replace college     = (rnormal() <= college)     if !inlist(college    ,0,1) ;
replace white       = (rnormal() <= white)       if !inlist(white      ,0,1) ;
replace hisp_latino = (rnormal() <= hisp_latino) if !inlist(hisp_latino,0,1) ;
replace inc_med     = (rnormal() <= inc_med)     if !inlist(inc_med    ,0,1) ;

local cty_list "G M W" ;
foreach cty of local cty_list { ;

qui forv j = 1/5 { ;  /* ITERATIONS */
forv i = 1/5 { ;  /* VARIABLES */ 
	local var = "`var`i''" ;
	sum `var1' [w=wgt] if county == "`cty'" ;  local `var1'1_`cty' = r(mean) ;
	sum `var2' [w=wgt] if county == "`cty'" ;  local `var2'1_`cty' = r(mean) ;
	sum `var3' [w=wgt] if county == "`cty'" ;  local `var3'1_`cty' = r(mean) ;
	sum `var4' [w=wgt] if county == "`cty'" ;  local `var4'1_`cty' = r(mean) ;
	sum `var5' [w=wgt] if county == "`cty'" ;  local `var5'1_`cty' = r(mean) ;	
	/*di `i' ;
	di ``var1'_`cty'' "   "  ``var1'1_`cty'' ;
	di ``var2'_`cty'' "   "  ``var2'1_`cty'' ;
	di ``var3'_`cty'' "   "  ``var3'1_`cty'' ;
	di ``var4'_`cty'' "   "  ``var4'1_`cty'' ;
	di ``var5'_`cty'' "   "  ``var5'1_`cty'' ;*/
	replace wgt = wgt*(    ``var'_`cty'')/(    ``var'1_`cty'') if `var' == 1 & county == "`cty'" ;
	replace wgt = wgt*(1 - ``var'_`cty'')/(1 - ``var'1_`cty'') if `var' == 0 & county == "`cty'" ;
} ;
} ;

sum `var1' [w=wgt] if county == "`cty'" ;  local `var1'1_`cty' = r(mean) ;
sum `var2' [w=wgt] if county == "`cty'" ;  local `var2'1_`cty' = r(mean) ;
sum `var3' [w=wgt] if county == "`cty'" ;  local `var3'1_`cty' = r(mean) ;
sum `var4' [w=wgt] if county == "`cty'" ;  local `var4'1_`cty' = r(mean) ;
sum `var5' [w=wgt] if county == "`cty'" ;  local `var5'1_`cty' = r(mean) ;
log on ;
di "`cty'" ;
di "`var1':  " ``var1'_`cty'' "   "  ``var1'1_`cty'' ;
di "`var2':  " ``var2'_`cty'' "   "  ``var2'1_`cty'' ;
di "`var3':  " ``var3'_`cty'' "   "  ``var3'1_`cty'' ;
di "`var4':  " ``var4'_`cty'' "   "  ``var4'1_`cty'' ;
di "`var5':  " ``var5'_`cty'' "   "  ``var5'1_`cty'' ;
log off ;

} ;

keep code wgt ;
sort code ;
merge 1:1 code using `zzz' ;
assert _merge == 3 ;
replace wgt_final = wgt_final + wgt/`iter' ;
sort code ;
keep code wgt_final county ;
save `zzz', replace ;

restore ;

} ;



log on ;
sum income age college white hisp_latino [w=wgt] ;
sum income age college white hisp_latino [w=wgt] if county == "G" ;
sum income age college white hisp_latino [w=wgt] if county == "M" ;
sum income age college white hisp_latino [w=wgt] if county == "W" ;
sum income [w=wgt], detail ;
sum income [w=wgt] if county == "G", detail ;
sum income [w=wgt] if county == "M", detail ;
sum income [w=wgt] if county == "W", detail ;
log off ;



/* TRIMING WEIGHTS IF MORE THAN 6 TIMES INTERQUARTILE RANGE */

use `zzz', replace ;

local cty_list "G M W" ;
foreach cty of local cty_list { ;

total wgt_final if county == "`cty'" ;
matrix define xx = e(b) ; local wmean = xx[1,1] ;
sum wgt_final if county == "`cty'", detail ;
local p25 = r(p25) ;
local p50 = r(p50) ;
local p75 = r(p75) ;
gen double wgt_low  = `p50' + 6*(`p25'-`p50') if county == "`cty'" ;
gen double wgt_high = `p50' + 6*(`p75'-`p50') if county == "`cty'" ;
replace wgt_final = wgt_low  if wgt_final < wgt_low  & county == "`cty'" ;
replace wgt_final = wgt_high if wgt_final > wgt_high & county == "`cty'" ;
total wgt_final if county == "`cty'" ;
matrix define xx = e(b) ;
local amean = xx[1,1] ;
replace wgt_final = (wgt_final/`amean')*`wmean' if county == "`cty'" ;
total wgt_final if county == "`cty'" ;
di `wmean' ;
drop wgt_low wgt_high ;

} ;


log on ;
  sum wgt_final, detail ;
  sum wgt_final if county == "G", detail ;
  sum wgt_final if county == "M", detail ;
  sum wgt_final if county == "W", detail ;
  total wgt_final ;
  total wgt_final if county == "G" ;
  total wgt_final if county == "M" ;
  total wgt_final if county == "W" ;
log off ;

label var wgt_final "Final sampling weights" ;
sort code ;
compress ;
save $weights_data, replace ;


merge 1:1 code using $imputed_data ;
assert _merge == 3 ;

log on ;
  sum income age college white hisp_latino ;
  sum income age college white hisp_latino if county == "G" ;
  sum income age college white hisp_latino if county == "M" ;
  sum income age college white hisp_latino if county == "W" ;

  sum income age college white hisp_latino [w=wgt] ;
  sum income age college white hisp_latino [w=wgt] if county == "G" ;
  sum income age college white hisp_latino [w=wgt] if county == "M" ;
  sum income age college white hisp_latino [w=wgt] if county == "W" ;
log close ;
